Skip to content

Conversation

@addie9800
Copy link
Collaborator

No description provided.

@addie9800 addie9800 requested a review from MaxDall November 10, 2025 19:25
Comment on lines +62 to +97
class V1_1(V1):
    """Parser revision derived from ``V1`` that carries its own selectors.

    Defines the ``body``, ``topics`` and ``images`` attributes; each one
    delegates to a shared extraction helper and feeds it the selectors
    declared on this class.
    """

    # Evaluated once, when the class body executes (i.e. at import time).
    # NOTE(review): presumably marks this as the currently valid parser
    # version -- confirm against the project's versioning convention.
    VALID_UNTIL = datetime.date.today()

    # <p> children of a content <div> that own at least one direct text node.
    _paragraph_selector = XPath("//div[contains(@class, 'content')]/p[text()]")
    # Every <h2> in the document.
    _summary_selector = XPath("//h2")
    # <p> children of a content <div> with no direct text but a <strong> child.
    _subheadline_selector = XPath("//div[contains(@class, 'content')]/p[not(text()) and strong]")

    # <li> children of the first <ul> whose class contains 'text-[#3D619B]'.
    _topics_selector = XPath("(//ul[contains(@class, 'text-[#3D619B]')])[1]/li")

    @attribute
    def body(self) -> Optional[ArticleBody]:
        """Extract the article body using this class's selectors."""
        document = self.precomputed.doc
        return extract_article_body_with_selector(
            document,
            paragraph_selector=self._paragraph_selector,
            summary_selector=self._summary_selector,
            subheadline_selector=self._subheadline_selector,
        )

    @attribute
    def topics(self) -> List[str]:
        """Parse the topic list from the selected ``<li>`` nodes."""
        topic_nodes = self._topics_selector(self.precomputed.doc)
        # Node texts are joined on "," and split again on the same delimiter.
        joined_topics = strip_nodes_to_text(topic_nodes, join_on=",")
        # Matches a hyphen, optionally followed by whitespace, at the end of the text.
        trailing_hyphen = re.compile(r"-\s*$")
        return generic_topic_parsing(
            joined_topics,
            substitution_pattern=trailing_hyphen,
            delimiter=",",
        )

    @attribute
    def images(self) -> List[Image]:
        """Collect the article's images via the shared image extraction helper."""
        document = self.precomputed.doc
        paragraphs = self._paragraph_selector
        # Boundaries: the <h1> above, the first 'Google Play' image below.
        upper_boundary = CSSSelector("h1")
        lower_boundary = XPath("(//img[@alt='Google Play'])[1]")
        # <img> elements inside the articleBody <div>, except fetchpriority='auto'.
        article_images = XPath("//div[@property='articleBody']//img[not(@fetchpriority='auto')]")
        # <div> children of an enclosing 'relative' <div> holding a <picture> or <img>.
        image_authors = XPath("./ancestor::div[contains(@class,'relative') and (picture or img)]/div")
        return image_extraction(
            doc=document,
            paragraph_selector=paragraphs,
            upper_boundary_selector=upper_boundary,
            lower_boundary_selector=lower_boundary,
            image_selector=article_images,
            author_selector=image_authors,
        )
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That looks like it might be worth opening a new major version

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants